{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "51CTO课程频道：http://edu.51cto.com/lecturer/index/user_id-12330098.html<br>\n",
    "优酷频道：http://i.youku.com/sdxxqbf<br>\n",
    "微信公众号：深度学习与神经网络<br>\n",
    "Github：https://github.com/Qinbf<br>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import os\n",
    "import time\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import math\n",
    "from tqdm import tqdm\n",
    "from six.moves import xrange"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Parameters:\n",
      "BATCH_SIZE=64\n",
      "CHECKPOINT_EVERY=200\n",
      "DATA_FILE=./ieee_zhihu_cup/data_topic_block_0.txt\n",
      "DEV_SAMPLE_PERCENTAGE=0.1\n",
      "DROPOUT_KEEP_PROB=0.5\n",
      "EMBEDDING_DIM=256\n",
      "EVALUATE_EVERY=50\n",
      "FILTER_SIZES=3,4,5\n",
      "L2_REG_LAMBDA=0.0005\n",
      "NUM_CHECKPOINTS=5\n",
      "NUM_EPOCHS=10\n",
      "NUM_FILTERS=1024\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Parameters\n",
    "# ==================================================\n",
    "\n",
    "# Data loading params\n",
    "# validation数据集占比\n",
    "tf.flags.DEFINE_float(\"dev_sample_percentage\", .1, \"Percentage of the training data to use for validation\")\n",
    "# 数据集\n",
    "tf.flags.DEFINE_string(\"data_file\", \"./ieee_zhihu_cup/data_topic_block_0.txt\", \"Data source for the positive data.\")\n",
    "\n",
    "# Model Hyperparameters\n",
    "# 词向量长度\n",
    "tf.flags.DEFINE_integer(\"embedding_dim\", 256, \"Dimensionality of character embedding (default: 256)\")\n",
    "# 卷积核大小\n",
    "tf.flags.DEFINE_string(\"filter_sizes\", \"3,4,5\", \"Comma-separated filter sizes (default: '3,4,5')\")\n",
    "# 每一种卷积核个数\n",
    "tf.flags.DEFINE_integer(\"num_filters\", 1024, \"Number of filters per filter size (default: 1024)\")\n",
    "# dropout参数\n",
    "tf.flags.DEFINE_float(\"dropout_keep_prob\", 0.5, \"Dropout keep probability (default: 0.5)\")\n",
    "# l2正则化参数\n",
    "tf.flags.DEFINE_float(\"l2_reg_lambda\", 0.0005, \"L2 regularization lambda (default: 0.0005)\")\n",
    "\n",
    "# Training parameters\n",
    "# 批次大小\n",
    "tf.flags.DEFINE_integer(\"batch_size\", 64, \"Batch Size (default: 64)\")\n",
    "# 迭代周期\n",
    "tf.flags.DEFINE_integer(\"num_epochs\", 10, \"Number of training epochs (default: 10)\")\n",
    "# 多少step测试一次\n",
    "tf.flags.DEFINE_integer(\"evaluate_every\", 50, \"Evaluate model on dev set after this many steps (default: 50)\")\n",
    "# 多少step保存一次模型\n",
    "tf.flags.DEFINE_integer(\"checkpoint_every\", 200, \"Save model after this many steps (default: 200)\")\n",
    "# 保存多少个模型\n",
    "tf.flags.DEFINE_integer(\"num_checkpoints\", 5, \"Number of checkpoints to store (default: 5)\")\n",
    "\n",
    "# flags解析\n",
    "FLAGS = tf.flags.FLAGS\n",
    "FLAGS._parse_flags()\n",
    "\n",
    "# 打印所有参数\n",
    "print(\"\\nParameters:\")\n",
    "for attr, value in sorted(FLAGS.__flags.items()):\n",
    "    print(\"{}={}\".format(attr.upper(), value))\n",
    "print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [01:15<00:00, 3959.17it/s]\n"
     ]
    }
   ],
   "source": [
    "y = []\n",
    "x_text = []\n",
    "\n",
    "# 读取训练数据和标签\n",
    "reader = pd.read_table(FLAGS.data_file,sep='\\t',header=None)\n",
    "for i in tqdm(xrange(reader.shape[0])):\n",
    "    # 按','切分标签\n",
    "    temp = reader.iloc[i][1].split(',')\n",
    "    # 如果分类数大于5，只取前5个分类\n",
    "    if (len(temp)>5):\n",
    "        temp = temp[0:5]\n",
    "    # 设置标签的对应位置为1，其余位置为0\n",
    "    label = np.zeros(1999)\n",
    "    for temp_label in temp:\n",
    "        label[int(temp_label)] = 1\n",
    "    y.append(label)\n",
    "    x_text.append(reader.iloc[i][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w31389,w6,w1019,w69288,w111,w3332,w109,w11,w25,w1110,w111', 'w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w875,w3352,w500,w21790,w12144,w111', 'w875,w15450,w42394,w15863,w6,w95421,w25,w803,w346,w6,w3763,w347,w88,w111', 'w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w140344,w111,w112,w49270,w2129,w6,w6978,w359,w10147,w111', 'w380,w54,w674,w133,w54,w134,w614,w54,w929,w307,w109,w110,w19045,w6,w5830,w111']\n",
      "[[ 0.  1.  0. ...,  0.  0.  0.]\n",
      " [ 0.  0.  0. ...,  0.  0.  0.]\n",
      " [ 0.  0.  0. ...,  0.  0.  0.]\n",
      " [ 0.  0.  0. ...,  0.  0.  0.]\n",
      " [ 0.  0.  0. ...,  0.  0.  0.]]\n"
     ]
    }
   ],
   "source": [
    "# 打印x_text和y的前5行\n",
    "print(x_text[0:5])\n",
    "y = np.array(y, dtype = np.float32)\n",
    "print(y[0:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x_shape: (300000, 72)\n",
      "y_shape: (300000, 1999)\n",
      "Vocabulary Size: 131900\n",
      "Train/Dev split: 270000/30000\n",
      "x: [[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15  4 16 17 13  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [18 19 20 21 22 19 23 10 24 25 26 27 28 29 13  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [25 30 31 32 10 33 16 34 35 10 36 37 38 13  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [39 40 41 42 19 43 19 44 45 46 13 47 48 49 10 50 51 52 13  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [53 19 54 55 19 56 57 19 58 59 15 45 60 10 61 13  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]\n",
      "y: [[ 0.  1.  0. ...,  0.  0.  0.]\n",
      " [ 0.  0.  0. ...,  0.  0.  0.]\n",
      " [ 0.  0.  0. ...,  0.  0.  0.]\n",
      " [ 0.  0.  0. ...,  0.  0.  0.]\n",
      " [ 0.  0.  0. ...,  0.  0.  0.]]\n"
     ]
    }
   ],
   "source": [
    "# Build vocabulary\n",
    "# 计算一段文本中最多的词汇数\n",
    "max_document_length = max([len(x.split(\",\")) for x in x_text])\n",
    "vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)\n",
    "\n",
    "x = np.array(list(vocab_processor.fit_transform(x_text)))\n",
    "print(\"x_shape:\",x.shape)\n",
    "print(\"y_shape:\",y.shape)\n",
    "\n",
    "# 保存字典\n",
    "vocab_processor.save(\"vocab_dict\")\n",
    "\n",
    "# Split train/test set\n",
    "# 数据集切分为两部分，训练集和验证集\n",
    "dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))\n",
    "x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]\n",
    "y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]\n",
    "\n",
    "print(\"Vocabulary Size: {:d}\".format(len(vocab_processor.vocabulary_)))\n",
    "print(\"Train/Dev split: {:d}/{:d}\".format(len(y_train), len(y_dev)))\n",
    "print(\"x:\",x_train[0:5])\n",
    "print(\"y:\",y_train[0:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num_filters_total: 3072\n"
     ]
    }
   ],
   "source": [
    "# 定义三个placeholder\n",
    "input_x = tf.placeholder(tf.int32, [None, x_train.shape[1]], name=\"input_x\")\n",
    "input_y = tf.placeholder(tf.float32, [None, y_train.shape[1]], name=\"input_y\")\n",
    "dropout_keep_prob = tf.placeholder(tf.float32, name=\"dropout_keep_prob\")\n",
    "\n",
    "# sequence_length-最长词汇数\n",
    "sequence_length=x_train.shape[1]\n",
    "# num_classes-分类数\n",
    "num_classes=y_train.shape[1]\n",
    "# vocab_size-总词汇数\n",
    "vocab_size=len(vocab_processor.vocabulary_)\n",
    "# embedding_size-词向量长度\n",
    "embedding_size=FLAGS.embedding_dim\n",
    "# filter_sizes-卷积核尺寸3，4，5\n",
    "filter_sizes=list(map(int, FLAGS.filter_sizes.split(\",\")))\n",
    "# num_filters-卷积核数量\n",
    "num_filters=FLAGS.num_filters\n",
    "        \n",
    "Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name=\"Weights\")\n",
    "# shape:[None, sequence_length, embedding_size]\n",
    "embedded_chars = tf.nn.embedding_lookup(Weights, input_x)\n",
    "# 添加一个维度，shape:[None, sequence_length, embedding_size, 1]\n",
    "embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)\n",
    "\n",
    "# Create a convolution + maxpool layer for each filter size\n",
    "pooled_outputs = []\n",
    "for i, filter_size in enumerate(filter_sizes):\n",
    "    with tf.name_scope(\"conv-maxpool-%s\" % filter_size):\n",
    "        # Convolution Layer\n",
    "        filter_shape = [filter_size, embedding_size, 1, num_filters]\n",
    "        W = tf.Variable(\n",
    "            tf.truncated_normal(filter_shape, stddev=0.1), name=\"W\")\n",
    "        b = tf.Variable(\n",
    "            tf.constant(0.1, shape=[num_filters]), name=\"b\")\n",
    "        conv = tf.nn.conv2d(\n",
    "            embedded_chars_expanded,\n",
    "            W,\n",
    "            strides=[1, 1, 1, 1],\n",
    "            padding=\"VALID\",\n",
    "            name=\"conv\")\n",
    "        # Apply nonlinearity\n",
    "        h = tf.nn.relu(tf.nn.bias_add(conv, b), name=\"relu\")\n",
    "        # Maxpooling over the outputs\n",
    "        pooled = tf.nn.max_pool(\n",
    "            h,\n",
    "            ksize=[1, sequence_length - filter_size + 1, 1, 1],\n",
    "            strides=[1, 1, 1, 1],\n",
    "            padding='VALID',\n",
    "            name=\"pool\")\n",
    "        pooled_outputs.append(pooled)\n",
    "\n",
    "# Combine all the pooled features\n",
    "num_filters_total = num_filters * len(filter_sizes)\n",
    "print(\"num_filters_total:\", num_filters_total)\n",
    "h_pool = tf.concat(pooled_outputs, 3)\n",
    "h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])\n",
    "\n",
    "# Add dropout\n",
    "with tf.name_scope(\"dropout\"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob)\n",
    "\n",
    "# Final (unnormalized) scores and predictions\n",
    "with tf.name_scope(\"output\"):\n",
    "    W = tf.get_variable(\n",
    "        \"W\",\n",
    "        shape=[num_filters_total, num_classes],\n",
    "        initializer=tf.contrib.layers.xavier_initializer())\n",
    "    b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name=\"b\")\n",
    "    scores = tf.nn.xw_plus_b(h_drop, W, b, name=\"scores\")\n",
    "    \n",
    "# 定义loss\n",
    "with tf.name_scope(\"loss\"):\n",
    "    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=scores, labels=input_y))\n",
    "\n",
    "# 定义优化器\n",
    "with tf.name_scope(\"optimizer\"):\n",
    "    optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 生成批次数据\n",
    "def batch_iter(data, batch_size, num_epochs, shuffle=False):\n",
    "    \"\"\"\n",
    "    Generates a batch iterator for a dataset.\n",
    "    \"\"\"\n",
    "    data = np.array(data)\n",
    "    data_size = len(data)\n",
    "    # 每个epoch的num_batch\n",
    "    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1\n",
    "    print(\"num_batches_per_epoch:\",num_batches_per_epoch)\n",
    "    for epoch in range(num_epochs):\n",
    "        # Shuffle the data at each epoch\n",
    "        if shuffle:\n",
    "            shuffle_indices = np.random.permutation(np.arange(data_size))\n",
    "            shuffled_data = data[shuffle_indices]\n",
    "        else:\n",
    "            shuffled_data = data\n",
    "        for batch_num in range(num_batches_per_epoch):\n",
    "            start_index = batch_num * batch_size\n",
    "            end_index = min((batch_num + 1) * batch_size, data_size)\n",
    "            yield shuffled_data[start_index:end_index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 知乎提供的评测方案\n",
    "def eval(predict_label_and_marked_label_list):\n",
    "    \"\"\"\n",
    "    :param predict_label_and_marked_label_list: 一个元组列表。例如\n",
    "    [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),\n",
    "      ([3, 2, 1, 4, 7], [5, 7, 3])\n",
    "     ]\n",
    "    需要注意这里 predict_label 是去重复的，例如 [1,2,3,2,4,1,6]，去重后变成[1,2,3,4,6]\n",
    "    \n",
    "    marked_label_list 本身没有顺序性，但提交结果有，例如上例的命中情况分别为\n",
    "    [0，0，0，1，1]   (4，5命中)\n",
    "    [1，0，0，0，1]   (3，7命中)\n",
    "\n",
    "    \"\"\"\n",
    "    right_label_num = 0  #总命中标签数量\n",
    "    right_label_at_pos_num = [0, 0, 0, 0, 0]  #在各个位置上总命中数量\n",
    "    sample_num = 0   #总问题数量\n",
    "    all_marked_label_num = 0    #总标签数量\n",
    "    for predict_labels, marked_labels in predict_label_and_marked_label_list:\n",
    "        sample_num += 1\n",
    "        marked_label_set = set(marked_labels)\n",
    "        all_marked_label_num += len(marked_label_set)\n",
    "        for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):\n",
    "            if label in marked_label_set:     #命中\n",
    "                right_label_num += 1\n",
    "                right_label_at_pos_num[pos] += 1\n",
    "\n",
    "    precision = 0.0\n",
    "    for pos, right_num in zip(range(0, 5), right_label_at_pos_num):\n",
    "        precision += ((right_num / float(sample_num))) / math.log(2.0 + pos)  # 下标0-4 映射到 pos1-5 + 1，所以最终+2\n",
    "    recall = float(right_label_num) / all_marked_label_num\n",
    "\n",
    "    return 2*(precision * recall) / (precision + recall )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num_batches_per_epoch: 4219\n"
     ]
    }
   ],
   "source": [
    "# 定义saver，只保存最新的5个模型\n",
    "saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    predict_top_5 = tf.nn.top_k(scores, k=5)\n",
    "    label_top_5 = tf.nn.top_k(input_y, k=5) \n",
    "    sess.run(tf.global_variables_initializer())\n",
    "    i = 0\n",
    "    # 生成数据\n",
    "    batches = batch_iter(\n",
    "        list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)\n",
    "    for batch in batches:\n",
    "        i = i + 1\n",
    "        # 得到一个batch的数据\n",
    "        x_batch, y_batch = zip(*batch)\n",
    "        # 优化模型\n",
    "        sess.run([optimizer],feed_dict={input_x:x_batch, input_y:y_batch, dropout_keep_prob:FLAGS.dropout_keep_prob})\n",
    "\n",
    "        # 每训练50次测试1次\n",
    "        if (i % FLAGS.evaluate_every == 0):\n",
    "            print (\"Evaluation:step\",i)\n",
    "            predict_5, label_5, _loss = sess.run([predict_top_5,label_top_5,loss],feed_dict={input_x:x_batch,\n",
    "                                                                                      input_y:y_batch,\n",
    "                                                                                      dropout_keep_prob:1.0})\n",
    "            print (\"label:\",label_5[1][:5])\n",
    "            print (\"predict:\",predict_5[1][:5])\n",
    "            print (\"predict:\",predict_5[0][:5])\n",
    "            print (\"loss:\",_loss)\n",
    "            predict_label_and_marked_label_list = []\n",
    "            for predict,label in zip(predict_5[1],label_5[1]):\n",
    "                predict_label_and_marked_label_list.append((list(predict),list(label)))\n",
    "            score = eval(predict_label_and_marked_label_list)\n",
    "            print(\"score:\",score)\n",
    "\n",
    "        # 每训练200次保存1次模型\n",
    "        if (i % FLAGS.checkpoint_every == 0):\n",
    "            path = saver.save(sess, \"models/model\", global_step=i)\n",
    "            print(\"Saved model checkpoint to {}\".format(path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
